package com.datahole.suffixarray.Main;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Stack;

import com.datahole.suffixarray.model.InitTuple;
import com.datahole.suffixarray.model.ResultTuple;
import com.datahole.suffixarray.model.Suffix;
import com.datahole.suffixarray.util.StringUtil;
import com.google.common.collect.Multimap;

/**
 * Statistics over repeated substrings found via a suffix array: counts,
 * right/left adjacency classes, mutual information (SMI), accessor variety
 * (AV) and independence (IND) scores used for new-word detection.
 *
 * @author hlyue
 */
public class Statistic {

	/** Repeated-substring statistics accumulated by calcWordCount, keyed by the substring text. */
	private final Map<String, ResultTuple> map;

	public Statistic() {
		map = new HashMap<String, ResultTuple>();
	}

	/**
	 * Walks the suffix array's height (LCP) array with a monotone stack to
	 * count every repeated substring of {@code text} and record its adjacency
	 * classes, storing survivors of the filter rules into {@link #map}.
	 *
	 * @param text      the text the suffix array was built from
	 * @param suffix    suffix array together with its height (LCP) array
	 * @param isReverse true when {@code text} is the reversed original; the
	 *                  collected substrings are reversed back before storing
	 */
	public void calcWordCount(String text, Suffix suffix, boolean isReverse) {
		// NOTE(review): java.util.Stack is legacy/synchronized; ArrayDeque
		// would be the modern choice, kept as-is to avoid an import change.
		Stack<ResultTuple> s = new Stack<ResultTuple>();
		int i = 1;
		while (i < suffix.heightArray.length) {
			List<Integer> al = new ArrayList<Integer>(2);
			if (s.isEmpty()) {
				if (suffix.heightArray[i] != 0) {
					al.add(1);
					al.add(1);
					s.push(new ResultTuple(i, 2, al));
				}
				i++;
			} else {
				ResultTuple temp = s.pop();
				if (suffix.heightArray[temp.id] < suffix.heightArray[i]) {
					// Rising LCP: a longer repeat starts here; stack a new tuple.
					s.push(temp);
					al.add(1);
					al.add(1);
					s.push(new ResultTuple(i, 2, al));
					i++;
					continue;
				}
				if (suffix.heightArray[temp.id] == suffix.heightArray[i]) {
					// Equal LCP: one more occurrence of the same repeat.
					temp.frequncy++;
					al.add(1);
					s.push(temp);
					i++;
					continue;
				}
				if (suffix.heightArray[temp.id] > suffix.heightArray[i]) {
					// Falling LCP: the repeat on top of the stack is complete.
					// Length cap avoids StringIndexOutOfBoundsException on
					// substrings running past the end of the text.
					if (suffix.heightArray[temp.id] < 15) {
						String sb = text.substring(suffix.getSa()[temp.id],
								suffix.getSa()[temp.id]
										+ suffix.heightArray[temp.id]);
						if (sb.length() < 10) {
							if (!isReverse)
								this.fillMap(sb, temp);

							else
								// Reversed-text pass: restore original order.
								this.fillMap(new StringBuffer(sb).reverse()
										.toString(), temp);
						}
					}

					int fre = temp.frequncy;
					if (!s.isEmpty()) {
						ResultTuple t = s.pop();
						// Fixed 2012-07-25: some frequency counts were wrong.
						if (suffix.heightArray[i] > suffix.heightArray[t.id]) {
							s.push(t);
							al.add(1);
							s.push(new ResultTuple(i, fre + 1, al));
							i++;
							continue;
						} else {
							// Fold the finished repeat's occurrences into the
							// shorter repeat (its prefix) below it on the stack.
							t.frequncy += (fre - 1);
							if (t.av.size() > 0) {
								int x = t.av.get(t.av.size() - 1);
								x += (fre - 1);
								t.av.set(t.av.size() - 1, x);
								s.push(t);
							}

						}
					} else {
						if (suffix.heightArray[i] != 0) {
							al.add(temp.frequncy);
							s.push(new ResultTuple(i, fre, al));
						}
					}
				}

			}
			al = null;
		}
	}

	/**
	 * Filters a candidate repeated substring and, if it survives every rule,
	 * stores it in {@link #map}. When the text was split into several small
	 * files, a word seen again simply replaces its previous entry.
	 *
	 * @param word candidate repeated substring
	 * @param temp its frequency / adjacency statistics
	 */
	private void fillMap(String word, ResultTuple temp) {
		// Guard: the substring rules below index positions 0 and length-1.
		if (word.isEmpty())
			return;
		int length = word.length();

		// Spans a line boundary ("$" is the line-separator sentinel).
		if (word.contains("$"))
			return;

		// Pure digit string.
		if (StringUtil.isNumberic(word))
			return;

		// Starts or ends with punctuation.
		if (StringUtil.isDotString(word.substring(0, 1))
				|| StringUtil.isDotString(word.substring(length - 1, length)))
			return;

		// Contains characters that are not Chinese / digits / English letters.
		if (StringUtil.isNotChDiEnStr(word))
			return;

		if (filterByRule(word))
			return;

		this.map.put(word, temp);

	}

	/**
	 * Intersects the repeated substrings found in the forward pass with those
	 * found in the reversed pass, keeping words of length 2..49.
	 *
	 * @param r_map results of the forward (right-adjacency) pass
	 * @param l_map results of the reverse (left-adjacency) pass
	 * @return key: repeated substring; value: see {@link InitTuple}
	 */
	public Map<String, InitTuple> intersectionMap(
			Map<String, ResultTuple> r_map, Map<String, ResultTuple> l_map) {
		Map<String, InitTuple> result = new HashMap<String, InitTuple>();
		Set<String> key = new HashSet<String>(r_map.keySet());
		key.retainAll(l_map.keySet());
		for (String word : key) {
			// Only words of a sane length are candidates for new words.
			if (word.length() > 1 && word.length() < 50) {
				ResultTuple r = r_map.get(word);
				ResultTuple l = l_map.get(word);
				result.put(word, new InitTuple(r.frequncy, r.av, l.av));
			}
		}
		return result;
	}

	/**
	 * Unions the repeated substrings of the forward and reverse passes.
	 *
	 * @param r_map results of the forward pass
	 * @param l_map results of the reverse pass
	 * @return key: repeated substring; value: its frequency in the text
	 */
	public Map<String, Integer> unionMap(Map<String, ResultTuple> r_map,
			Map<String, ResultTuple> l_map) {
		Map<String, Integer> result = new HashMap<String, Integer>();
		Set<String> key = new HashSet<String>(r_map.keySet());
		key.addAll(l_map.keySet());
		for (String word : key) {
			ResultTuple r = r_map.get(word);
			ResultTuple l = l_map.get(word);
			if (r == null) {
				result.put(word, l.frequncy);
			} else if (l == null) {
				result.put(word, r.frequncy);
			} else {
				// The two passes can disagree (known counting limitation);
				// take the larger frequency.
				result.put(word, Math.max(r.frequncy, l.frequncy));
			}
		}
		return result;
	}

	/**
	 * Computes the (substring) mutual information of each repeated substring.
	 *
	 * Larger MI means the word co-occurs more tightly with its own substrings
	 * — higher cohesion, more likely a new word. Maximum is 1.
	 *
	 * @param multiMap output: per word, its frequency then its MI value
	 * @param imap     intersection map (word -> frequency / adjacency data)
	 * @param umap     union map (word -> frequency)
	 */
	public void calcSMI(Multimap<String, Float> multiMap,
			Map<String, InitTuple> imap, Map<String, Integer> umap) {
		for (Entry<String, InitTuple> entry : imap.entrySet()) {
			String word = entry.getKey();
			float count = entry.getValue().frequncy;
			if (word.length() > 1) {
				String l_substr = word.substring(0, word.length() - 1);
				String r_substr = word.substring(1, word.length());
				// A sub-substring missing from the union map is assumed to
				// occur exactly as often as the word itself.
				float l_count = umap.get(l_substr) == null ? count : umap
						.get(l_substr);
				float r_count = umap.get(r_substr) == null ? count : umap
						.get(r_substr);
				float deno = (l_count + r_count - count);
				multiMap.put(word, count);
				multiMap.put(word, Float.valueOf(count / deno));

			} else {
				// Single-character words get the maximal MI of 1.
				multiMap.put(word, count);
				multiMap.put(word, Float.valueOf(1));
			}
		}
	}

	/**
	 * Computes the accessor-variety (AV) entropy of each repeated substring
	 * and keeps the smaller of the left/right values.
	 *
	 * Higher entropy means the word attaches to more distinct neighbours, i.e.
	 * it is freer and more likely to be a new word.
	 *
	 * @param multiMap output: per word, its min(left, right) AV entropy
	 * @param imap     word -> frequency plus left/right adjacency count lists
	 */
	public void calcAV(Multimap<String, Float> multiMap,
			Map<String, InitTuple> imap) {
		for (Entry<String, InitTuple> entry : imap.entrySet()) {
			String word = entry.getKey();
			InitTuple tuple = entry.getValue();
			double fre = (double) tuple.frequncy;
			double lav = 0.0, rav = 0.0;
			// NOTE(review): rav_list feeds lav and lav_list feeds rav,
			// mirroring the original code — confirm the naming is intended.
			for (int c : tuple.rav_list) {
				double p = c / fre;
				lav -= p * Math.log(p);
			}
			for (int c : tuple.lav_list) {
				double p = c / fre;
				rav -= p * Math.log(p);
			}
			double av = Math.min(lav, rav);
			multiMap.put(word, Float.valueOf((float) av));
		}
	}

	/**
	 * Computes the "independence" of each repeated substring (similar to AV).
	 * The paper describing it is inconsistent, so it is currently unused.
	 *
	 * BUG FIX: the original averaged via {@code (1 / n) * sum} with an int
	 * {@code n} — integer division yields 0 for every n &gt; 1, zeroing the
	 * score (and 1/0 threw on empty lists). Now uses floating-point division
	 * guarded against empty lists.
	 *
	 * @param multiMap output: per word, its independence score
	 * @param imap     word -> frequency plus left/right adjacency count lists
	 */
	public void calcIND(Multimap<String, Double> multiMap,
			Map<String, InitTuple> imap) {
		for (Entry<String, InitTuple> entry : imap.entrySet()) {
			String word = entry.getKey();
			InitTuple tuple = entry.getValue();
			double fre = (double) tuple.frequncy;
			double lind = 0.0, rind = 0.0;
			List<Integer> temp = tuple.rav_list;
			for (int i = 0; i < temp.size(); i++) {
				lind += temp.get(i) / fre;
			}
			if (!temp.isEmpty())
				lind /= temp.size(); // mean, in floating point
			temp = tuple.lav_list;
			for (int i = 0; i < temp.size(); i++) {
				rind += temp.get(i) / fre;
			}
			if (!temp.isEmpty())
				rind /= temp.size();
			double ind = (lind + rind) / 2;
			multiMap.put(word, Double.valueOf(ind));
		}
	}

	/** @return the repeated-substring map accumulated by calcWordCount. */
	public Map<String, ResultTuple> getMap() {
		return map;
	}

	/**
	 * Extra hand-written rejection rules.
	 *
	 * @param word candidate repeated substring (must be non-empty)
	 * @return true when the word should be discarded
	 */
	public boolean filterByRule(String word) {
		// Starts with a digit.
		if (StringUtil.isNumberic(word.substring(0, 1)))
			return true;
		// Starts with the Chinese particle "的".
		if (word.startsWith("的"))
			return true;
		return false;
	}
}